This Rmarkdonwn document can be used to reproduce the panels in figure 3 of the manuscript: huva: human variation in gene expression as surrogate for gene function. To run this script without any problem of dependencies or conflicts in the installation we suggest to use the docker image we provide (see README file).
In this figure we evaluate the robustness of the huva methods. We will perform here a series of experiment to show the robustness and valitidy of our approach.
library(huva)
library(huva.db)
library(ggplot2)
library(tidyverse)
library(reshape2)
library(limma)
library(fgsea)
library(Rmisc)
library(ggpubr)
library(pheatmap)
library(viridis)
library(foreach)
library(ggsci)
source("./source/costum_functions.R")
set.seed(91)
Part of the simulation executed here is calculated in parallel to lower the computational time, this is not necessary, the user can decide to activate or not this feature, changing the parameter ‘par_cal’ to TRUE
par_cal <- FALSE
if (par_cal == TRUE) {
if (require(doMC) == F) {
BiocManager::install("doMC", version = "3.12", update = F)
}
library(doMC)
# Devine the number of workers for the parallel computing
workers <- 4
}
We here wannt to compare the result of an huva experiment, where the comparison if between two groups characterized by low or high expression of a gene of interest, and a random selection of samples in the two experimental groups. We defined the run_huva_experiment_random_sample function which does not select the samples based on the expression of the gene of interest but only randomize the two experimental groups of equal size to the original huva groups. Here for example a randomized huva experiment was perfomed of CRELD1 giving almost no difference in the expression of the GOI (boxplot).
binned_random <- run_huva_experiment_random_sample(data = huva.db,
gene = "CRELD1",
quantiles = 0.10,
gs_list = hallmarks_V7.2,
summ = T,
datasets_list = c("FG500"),
adjust.method = "BH")
## [1] "Binning on CRELD1 expression"
plot_binned_gene(goi = "CRELD1", huva_experiment = binned_random)$PBMC + theme_bw() +theme(aspect.ratio = 2) +
expand_limits(y = 7) + ggtitle("Randomized huva experiment")
select the number of permutation in the experiment, for simplicity I selected the same number of samples in the dataset
n.random <- round(dim(huva.db$FG500$data$PBMC)[2])
I will now compare the result of the real huva experiment with the the result of the randomly selected samples. This same analysis will be performed with both a gene showing high and low variance across the dataset.
variance <- RowVar(huva.db$FG500$data$PBMC)
variance <- variance[order(variance, decreasing = T)]
# Container for the results
random_low <- matrix(nrow = n.random+1, ncol = 3)
colnames(random_low) <- c("logFC", "p.val", "n.DEgenes")
# The randomized experiment
for (i in 1:n.random) {
binned_random <- run_huva_experiment_random_sample(data = huva.db,
gene = "CRELD1",
quantiles = 0.10,
gs_list = hallmarks_V7.2,
summ = F,
datasets_list = c("FG500"),
adjust.method = "BH")
logFC <- binned_random$FG500$DE_genes$PBMC["CRELD1",]$logFC
p.val <- binned_random$FG500$DE_genes$PBMC["CRELD1",]$P.Value
n.DEgenes <- sum(binned_random$FG500$DE_genes$PBMC$adj.P.Val <=0.05)
random_low[i,] <- c(logFC, p.val, n.DEgenes)
}
# The original experiment
binned_dataset <- run_huva_experiment(data = huva.db,
gene = "CRELD1",
quantiles = 0.10,
gs_list = hallmarks_V7.2,
summ = F,
datasets_list = c("FG500"),
adjust.method = "BH")
logFC <- binned_dataset$FG500$DE_genes$FG500_PBMC["CRELD1",]$logFC
p.val <- binned_dataset$FG500$DE_genes$FG500_PBMC["CRELD1",]$P.Value
n.DEgenes <- sum(binned_dataset$FG500$DE_genes$FG500_PBMC$adj.P.Val <=0.05)
# Preparation of the table
random_low[n.random+1,] <- c(logFC, p.val, n.DEgenes)
random_low <- as.data.frame(random_low)
random_low$type <- "random"
random_low$type[dim(random_low)[1]] <- "real"
random_low$absFC <- abs(random_low$logFC)
random_low$neglogpval <- -log10(random_low$p.val)
# Container for the results
random_high <- matrix(nrow = n.random+1, ncol = 3)
colnames(random_high) <- c("logFC", "p.val", "n.DEgenes")
# The randomized experiment
for (i in 1:n.random) {
binned_random <- run_huva_experiment_random_sample(data = huva.db,
gene = "SLC12A1",
quantiles = 0.10,
gs_list = hallmarks_V7.2,
summ = F,
datasets_list = c("FG500"),
adjust.method = "BH")
logFC <- binned_random$FG500$DE_genes$PBMC["SLC12A1",]$logFC
p.val <- binned_random$FG500$DE_genes$PBMC["SLC12A1",]$P.Value
n.DEgenes <- sum(binned_random$FG500$DE_genes$PBMC$adj.P.Val <=0.05)
random_high[i,] <- c(logFC, p.val, n.DEgenes)
}
# The original experiment
binned_dataset <- run_huva_experiment(data = huva.db,
gene = "SLC12A1",
quantiles = 0.10,
gs_list = hallmarks_V7.2,
summ = T,
datasets_list = c("FG500"),
adjust.method = "BH")
logFC <- binned_dataset$FG500$DE_genes$FG500_PBMC["SLC12A1",]$logFC
p.val <- binned_dataset$FG500$DE_genes$FG500_PBMC["SLC12A1",]$P.Value
n.DEgenes <- sum(binned_dataset$FG500$DE_genes$FG500_PBMC$adj.P.Val <=0.05)
# Preparation of the table
random_high[n.random+1,] <- c(logFC, p.val, n.DEgenes)
random_high <- as.data.frame(random_high)
random_high$type <- "random"
random_high$type[dim(random_high)[1]] <- "real"
random_high$absFC <- abs(random_high$logFC)
random_high$neglogpval <- -log10(random_high$p.val)
# Merging the two experiments
ds_low <- melt(random_low)
ds_high <- melt(random_high)
ds_low$data_type <- "low_variance"
ds_high$data_type <- "high_variance"
ds_random_sampling <- rbind(ds_low, ds_high)
ds_random_sampling$type <- factor(ds_random_sampling$type, levels = c("real", "random"))
ds_random_sampling <- ds_random_sampling[!ds_random_sampling$variable %in% c("logFC", "p.val"),]
ds_random_sampling$experiment <- "sample_random"
ds_random_sampling$variable <- factor(ds_random_sampling$variable,
levels = c("absFC", "neglogpval", "n.DEgenes"),
labels = c("absolute FC", "-log10p value", "n. DE genes"))
rm(binned_dataset, binned_random, ds_high, ds_low, random_high, random_low, i, logFC, n.DEgenes, n.random, p.val, variance)
ggplot(ds_random_sampling[ds_random_sampling$data_type == "high_variance",], aes(x=experiment, y=value, fill=type)) +
geom_point(shape=21, position = position_dodge2(.4), size=3) +
facet_wrap(~variable, scales = "free") +
theme_bw() + theme(aspect.ratio = 2) +
scale_fill_manual(values = c("#BC3C29", "#01A087")) +
ggtitle("Figure 3b and 3c")
ggplot(ds_random_sampling[ds_random_sampling$data_type == "low_variance",], aes(x=experiment, y=value, fill=type)) +
geom_point(shape=21, position = position_dodge2(.4), size=3) +
facet_wrap(~variable, scales = "free") +
theme_bw() + theme(aspect.ratio = 2) +
scale_fill_manual(values = c("#BC3C29", "#01A087")) +
ggtitle("Figure S5a and S5b")
We now want to evaluate what is the overlap in the DE genes calculated with an huva experiment across 1593 genes of interest in the 500FG dataset.
Random selection of the 1593 genes used for the experiment.
selection <- sample(rownames(huva.db$FG500$data$PBMC),
round(dim(huva.db$FG500$data$PBMC)[1]*0.1), replace = F)
bias_ds <- huva.db$FG500$data$PBMC
bias_ds <- t(seq(1, 1.1, length.out = dim(bias_ds)[2])*t(bias_ds))
huva_default_bias <- huva.db
huva_default_bias$FG500$data[["PBMC"]] <- bias_ds
rm(bias_ds)
pheatmap(huva.db$FG500$data$PBMC[1:100,],
cluster_rows = F, cluster_cols = F,
scale = "row",
border_color = FALSE,
color = colorRampPalette(c("blue", "white", "red"))(100),
main = "Figure S6b - original dataset",
show_rownames = FALSE,
show_colnames = FALSE,
breaks = seq(-5,5, length.out = 100),
cellwidth = 4, cellheight = 3)
pheatmap(huva_default_bias$FG500$data$PBMC[1:100,],
cluster_rows = F, cluster_cols = F,
scale = "row",
border_color = FALSE,
color = colorRampPalette(c("blue", "white", "red"))(100),
main = "Figure S6b - bias dataset",
show_rownames = FALSE,
show_colnames = FALSE,
breaks = seq(-5,5, length.out = 100),
cellwidth = 4, cellheight = 3)
df_orignal <- melt(huva.db$FG500$data$PBMC)
df_orignal$data_type <- "original"
df_bias <- melt(huva_default_bias$FG500$data$PBMC)
df_bias$data_type <- "bias"
bias_vis_df <- rbind(df_orignal, df_bias)
bias_vis_df$data_type <- factor(bias_vis_df$data_type, levels = c("original", "bias"))
# sample onlx 10 of the sample of the dataset to display
set.seed(1)
ggplot(bias_vis_df[bias_vis_df$Var2 %in% sample(levels(bias_vis_df$Var2), size = 5),], aes(x=Var2, y=value, fill=data_type)) +
geom_boxplot() +
scale_y_continuous(limits = c(0,20)) +
scale_fill_manual(values = c("#BC3C29", "#01A087")) +
theme_bw() +
ggtitle("Figure S6c - random selection of 5 samples to show the bias") + theme(aspect.ratio = 2/5)
values <- seq(1, 1.1, length.out = 95)
names(values) <- colnames(huva_default_bias$FG500$data$PBMC)
print(paste("The bias on the samples",
paste(c("HV078", "HV126", "HV259", "HV185", "HV435", "is:"), collapse = ", "),
paste(values[c("HV078", "HV126", "HV259", "HV185", "HV435")], collapse = ", "),
sep = " "))
## [1] "The bias on the samples HV078, HV126, HV259, HV185, HV435, is: 1, 1.03510638297872, 1.04042553191489, 1.07127659574468, 1.09148936170213"
rm(df_bias, df_orignal, bias_vis_df, values)
Here we source the function to calculate the overlap in DE genes. First with the real 500FG datased and second with the biased dataset.
source("./source/DE_overlap_orig.R")
print("Value distribution of the original dataset")
## [1] "Value distribution of the original dataset"
summary(DE_comp_original$overlap)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.01427 0.21550 0.41394 1.00000
source("./source/DE_overlap_bias.R")
print("Value distribution of the biased dataset")
## [1] "Value distribution of the biased dataset"
summary(DE_comp_bias$overlap)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2446 0.5790 0.5148 0.7861 1.0000
DE_comp_original$data_type <- "original"
DE_comp_bias$data_type <- "bias"
DE_comp_merge <- rbind(DE_comp_original, DE_comp_bias)
DE_comp_merge$data_type <- factor(DE_comp_merge$data_type, levels = c("original", "bias"))
ggplot(DE_comp_merge, aes(x=data_type, y=overlap, fill=data_type)) +
geom_boxplot() +
scale_fill_manual(values = c("#BC3C29", "#01A087")) +
theme_bw() + theme(aspect.ratio = 2) +
xlab("")+
ylab("DE genes overlap (fraction)") +
ggtitle("Figure 3e - Overlap DE genes - boxplot")
t.test(DE_comp_merge[DE_comp_merge$data_type=="original",]$overlap,
DE_comp_merge[DE_comp_merge$data_type=="bias",]$overlap, paired = F)
##
## Welch Two Sample t-test
##
## data: DE_comp_merge[DE_comp_merge$data_type == "original", ]$overlap and DE_comp_merge[DE_comp_merge$data_type == "bias", ]$overlap
## t = -1129.8, df = 5019517, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2997777 -0.2987394
## sample estimates:
## mean of x mean of y
## 0.2154973 0.5147559
rm(DE_comp_merge)
tmp <- DE_comp_original_matrix
for (i in 2:dim(tmp)[2]) {
tmp[1:(i-1),i] <- NA
}
pheatmap(tmp,
scale = "none",
cluster_rows = F,
cluster_cols = F,
color = inferno(direction = -1, 50),
show_rownames = F,
show_colnames = F,
cellwidth = 0.15,
cellheight = 0.15,
border_color = NA, na_col = NA,
breaks = seq(0, 1, length.out = 50),
main = "Figure 3f - Original dataset")
rm(tmp)
tmp <- DE_comp_bias_matrix
for (i in 2:dim(tmp)[2]) {
tmp[1:(i-1),i] <- NA
}
pheatmap(tmp,
scale = "none",
cluster_rows = F,
cluster_cols = F,
color = inferno(direction = -1, 50),
show_rownames = F,
show_colnames = F,
cellwidth = 0.15,
cellheight = 0.15,
border_color = NA, na_col = NA,
breaks = seq(0, 1, length.out = 50),
main = "Figure 3f - Bias dataset")
rm(tmp)
We now want to evaluate what is Pearson’s correlation between huva experiment across 1593 genes of interest in the 500FG dataset.
Here we source the function for the calculation of the pair-wise correlation coefficient. First for the original 500FG dataset and second for the biased dataset.
NOTE: This calculation required long time for the pair-wise comparisons, we provide the pre calculated tables to reproduce the plot in the manuscript and the original code.
# source("./source/Rank_overlap_orig.R")
rank_comp_origina <- readRDS("./data/rank_comp_orig.rds")
rank_comp_origina_matrix <- readRDS("./data/rank_comp_orig_mat.rds")
print("Value distribution of the original dataset")
## [1] "Value distribution of the original dataset"
summary(rank_comp_origina$overlap)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.988899 -0.462729 0.007376 0.004457 0.470226 0.992602
# source("./source/Rank_overlap_bias.R")
rank_comp_bias <- readRDS("./data/rank_comp_bias.rds")
rank_comp_bias_matrix <- readRDS("./data/rank_comp_bias_mat.rds")
print("Value distribution of the bias dataset")
## [1] "Value distribution of the bias dataset"
summary(rank_comp_bias$overlap)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.9432 -0.1168 0.2969 0.2286 0.6104 0.9948
rank_comp_origina$data_type <- "original"
rank_comp_bias$data_type <- "bias"
rank_comp_merge <- rbind(rank_comp_origina, rank_comp_bias)
rank_comp_merge$data_type <- factor(rank_comp_merge$data_type, levels = c("original", "bias"))
ggplot(rank_comp_merge, aes(x=data_type, y=overlap, fill=data_type)) +
geom_boxplot() +
scale_fill_manual(values = c("#BC3C29", "#01A087")) +
theme_bw() + theme(aspect.ratio = 2) +
xlab("")+
ylab("DE genes overlap (fraction)") +
ggtitle("Figure S8b - Overlap DE genes - boxplot")
t.test(rank_comp_merge[rank_comp_merge$data_type=="original",]$overlap,
rank_comp_merge[rank_comp_merge$data_type=="bias",]$overlap, paired = F)
##
## Welch Two Sample t-test
##
## data: rank_comp_merge[rank_comp_merge$data_type == "original", ]$overlap and rank_comp_merge[rank_comp_merge$data_type == "bias", ]$overlap
## t = -509.56, df = 4939704, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2249631 -0.2232392
## sample estimates:
## mean of x mean of y
## 0.004457361 0.228558501
rm(rank_comp_merge)
tmp <- rank_comp_origina_matrix
for (i in 2:dim(tmp)[2]) {
tmp[1:(i-1),i] <- NA
}
pheatmap(tmp,
scale = "none",
cluster_rows = F,
cluster_cols = F,
color = inferno(direction = -1, 50),
show_rownames = F,
show_colnames = F,
cellwidth = 0.15,
cellheight = 0.15,
breaks = seq(0, 1, length.out = 50),
border_color = NA, na_col = NA,
main = "Figure S8c - Original dataset")
rm(tmp)
tmp <- rank_comp_bias_matrix
for (i in 2:dim(tmp)[2]) {
tmp[1:(i-1),i] <- NA
}
pheatmap(tmp,
scale = "none",
cluster_rows = F,
cluster_cols = F,
color = inferno(direction = -1, 50),
show_rownames = F,
show_colnames = F,
cellwidth = 0.15,
cellheight = 0.15,
breaks = seq(0, 1, length.out = 50),
border_color = NA, na_col = NA,
main = "Figure S8d - Bias dataset")
rm(tmp)
We now want to evaluate in the selection of samples in the HIGH and LOW groups for an huva experiment across 1593 genes of interest in the 500FG dataset.
Here we source the function for the calculation of the overlap. First for the original 500FG dataset and second for the biased dataset.
source("./source/sample_overlap_orig.R")
print("Value distribution of the original dataset - LOW group")
## [1] "Value distribution of the original dataset - LOW group"
summary(samples_comp_original_low$overlap)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.1000 0.1485 0.2000 1.0000
print("Value distribution of the original dataset - HIGH group")
## [1] "Value distribution of the original dataset - HIGH group"
summary(samples_comp_original_high$overlap)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.1000 0.1366 0.2000 1.0000
source("./source/sample_overlap_bias.R")
print("Value distribution of the bias dataset - LOW group")
## [1] "Value distribution of the bias dataset - LOW group"
summary(samples_comp_bias_low$overlap)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1000 0.2000 0.2336 0.3000 1.0000
print("Value distribution of the bias dataset - HIGH group")
## [1] "Value distribution of the bias dataset - HIGH group"
summary(samples_comp_bias_high$overlap)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1000 0.2000 0.2205 0.3000 1.0000
samples_comp_original_low$data_type <- "original"
samples_comp_bias_low$data_type <- "bias"
samples_comp_merge_low <- rbind(samples_comp_original_low, samples_comp_bias_low)
samples_comp_merge_low$data_type <- factor(samples_comp_merge_low$data_type, levels = c("original", "bias"))
ggplot(samples_comp_merge_low, aes(x=data_type, y=overlap, fill=data_type)) +
geom_boxplot() +
scale_fill_manual(values = c("#BC3C29", "#01A087")) +
theme_bw() + theme(aspect.ratio = 2) +
xlab("")+
ylab("samples overlap (fraction)") +
ggtitle("Figure S7b - Overlap Samples LOW group - boxplot")
t.test(samples_comp_merge_low[samples_comp_merge_low$data_type=="original",]$overlap,
samples_comp_merge_low[samples_comp_merge_low$data_type=="bias",]$overlap, paired = F)
##
## Welch Two Sample t-test
##
## data: samples_comp_merge_low[samples_comp_merge_low$data_type == "original", ]$overlap and samples_comp_merge_low[samples_comp_merge_low$data_type == "bias", ]$overlap
## t = -634.77, df = 5003559, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.08534185 -0.08481646
## sample estimates:
## mean of x mean of y
## 0.1485132 0.2335924
rm(samples_comp_merge_low)
tmp <- samples_comp_original_matrix_low
for (i in 2:dim(tmp)[2]) {
tmp[1:(i-1),i] <- NA
}
pheatmap(tmp,
scale = "none",
cluster_rows = F,
cluster_cols = F,
color = inferno(direction = -1, 50),
show_rownames = F,
show_colnames = F,
cellwidth = 0.15,
cellheight = 0.15,
border_color = NA, na_col = NA,
breaks = seq(0, 1, length.out = 50),
main = "Figure S7c - Original dataset")
rm(tmp)
tmp <- samples_comp_bias_matrix_low
for (i in 2:dim(tmp)[2]) {
tmp[1:(i-1),i] <- NA
}
pheatmap(tmp,
scale = "none",
cluster_rows = F,
cluster_cols = F,
color = inferno(direction = -1, 50),
show_rownames = F,
show_colnames = F,
cellwidth = 0.15,
cellheight = 0.15,
na_col = NA, border_color = NA,
breaks = seq(0, 1, length.out = 50),
main = "Figure S7c - Original dataset")
rm(tmp)
samples_comp_original_high$data_type <- "original"
samples_comp_bias_high$data_type <- "bias"
samples_comp_merge_high <- rbind(samples_comp_original_high, samples_comp_bias_high)
samples_comp_merge_high$data_type <- factor(samples_comp_merge_high$data_type, levels = c("original", "bias"))
ggplot(samples_comp_merge_high, aes(x=data_type, y=overlap, fill=data_type)) +
geom_boxplot() +
scale_fill_manual(values = c("#BC3C29", "#01A087")) +
theme_bw() + theme(aspect.ratio = 2) +
xlab("")+
ylab("samples overlap (fraction)") +
ggtitle("Figure S7b - Overlap Samples HIGH group - boxplot")
t.test(samples_comp_merge_high[samples_comp_merge_high$data_type=="original",]$overlap,
samples_comp_merge_high[samples_comp_merge_high$data_type=="bias",]$overlap, paired = F)
##
## Welch Two Sample t-test
##
## data: samples_comp_merge_high[samples_comp_merge_high$data_type == "original", ]$overlap and samples_comp_merge_high[samples_comp_merge_high$data_type == "bias", ]$overlap
## t = -638.52, df = 5050152, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.08414215 -0.08362718
## sample estimates:
## mean of x mean of y
## 0.1365698 0.2204544
rm(samples_comp_merge_high)
tmp <- samples_comp_original_matrix_high
for (i in 2:dim(tmp)[2]) {
tmp[1:(i-1),i] <- NA
}
pheatmap(tmp,
scale = "none",
cluster_rows = F,
cluster_cols = F,
color = inferno(direction = -1, 50),
show_rownames = F,
show_colnames = F,
cellwidth = 0.15,
cellheight = 0.15,
border_color = NA, na_col = NA,
breaks = seq(0, 1, length.out = 50),
main = "Figure S7e - Original dataset HIGH group")
rm(tmp)
tmp <- samples_comp_bias_matrix_high
for (i in 2:dim(tmp)[2]) {
tmp[1:(i-1),i] <- NA
}
pheatmap(tmp,
scale = "none",
cluster_rows = F,
cluster_cols = F,
color = inferno(direction = -1, 50),
show_rownames = F,
show_colnames = F,
cellwidth = 0.15,
cellheight = 0.15,
border_color = NA, na_col = NA,
breaks = seq(0, 1, length.out = 50),
main = "Figure S7e - Bias dataset HIGH group")
rm(tmp)
We compare here the result of the overlap of samples when randomly selecting samples
source("./source/sample_overlap_random.R")
summary(sample_comp_random$overlap)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.1000 0.1053 0.2000 0.7000
ggplot(sample_comp_random, aes(x="random", y=overlap)) +
geom_boxplot(fill = "gray") +
xlab("")+
ylab("samples overlap (fraction)") +
theme_bw() + theme(aspect.ratio = 4) + ggtitle("Randoms Sampling Overlap - data not shown")
set.seed(91)
n.random.sign <- 1000
core <- huva.db::msigdb_V7.2$GSE22935_WT_VS_MYD88_KO_MACROPHAGE_UP
random_sign <- list()
for (i in 1:n.random.sign) {
sign <- sample(rownames(huva.db$FG500$data$PBMC), 186, replace = F)
random_sign[[paste("random nr. ", i, sep = "")]] <- sign
}
random_sign[["core"]] <- core
binned_dataset <- run_huva_experiment(data = huva.db,
gene = "MYD88",
quantiles = 0.10,
gs_list = random_sign,
summ = F,
datasets_list = c("FG500"),
adjust.method = "BH")
## [1] "Binning on MYD88 expression"
binned_dataset$FG500$gsea$FG500_PBMC$data_type <- c(rep("random", n.random.sign), "real")
binned_dataset$FG500$gsea$FG500_PBMC$data_type <- factor(binned_dataset$FG500$gsea$FG500_PBMC$data_type, levels = c("real", "random"))
rm(i, core, n.random.sign, sign, random_sign)
ggplot(binned_dataset$FG500$gsea$FG500_PBMC, aes(x=NES, y=-log10(pval), size=-log10(pval), fill=data_type))+
geom_point(shape=21) +
theme_bw() + theme(aspect.ratio = 2) +
scale_x_continuous(limits = c(-3, 3)) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
geom_vline(xintercept = 0, linetype = "dashed") +
scale_fill_manual(values = c("#BC3C29", "#01A087")) +
ggtitle("Figure 3h - Random gene signatures - MYD88") +
scale_y_continuous(limits = c(0,3))
rm(binned_dataset)
set.seed(91)
n.random.sign <- 1000
core <- huva.db::msigdb_V7.2$AKT_UP.V1_UP
random_sign <- list()
for (i in 1:n.random.sign) {
sign <- sample(rownames(huva.db$FG500$data$PBMC), 138, replace = F)
random_sign[[paste("random nr. ", i, sep = "")]] <- sign
}
random_sign[["core"]] <- core
binned_dataset <- run_huva_experiment(data = huva.db,
gene = "AKT1",
quantiles = 0.10,
gs_list = random_sign,
summ = F,
datasets_list = c("FG500"),
adjust.method = "BH")
## [1] "Binning on AKT1 expression"
binned_dataset$FG500$gsea$FG500_PBMC$data_type <- c(rep("random", n.random.sign), "real")
binned_dataset$FG500$gsea$FG500_PBMC$data_type <- factor(binned_dataset$FG500$gsea$FG500_PBMC$data_type, levels = c("real", "random"))
rm(i, core, n.random.sign, sign, random_sign)
ggplot(binned_dataset$FG500$gsea$FG500_PBMC, aes(x=NES, y=-log10(pval), size=-log10(pval), fill=data_type))+
geom_point(shape=21) +
theme_bw() + theme(aspect.ratio = 2) +
scale_x_continuous(limits = c(-3, 3)) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
geom_vline(xintercept = 0, linetype = "dashed") +
scale_fill_manual(values = c("#BC3C29", "#01A087")) +
ggtitle("Figure S9a - Random gene signatures - AKT") +
scale_y_continuous(limits = c(0,3))
rm(binned_dataset)
set.seed(91)
n.random.sign <- 1000
core <- huva.db::msigdb_V7.2$GO_ERK1_AND_ERK2_CASCADE
random_sign <- list()
for (i in 1:n.random.sign) {
sign <- sample(rownames(huva.db$FG500$data$PBMC), 219, replace = F)
random_sign[[paste("random nr. ", i, sep = "")]] <- sign
}
random_sign[["core"]] <- core
binned_dataset <- run_huva_experiment(data = huva.db,
gene = "MAPK3",
quantiles = 0.10,
gs_list = random_sign,
summ = F,
datasets_list = c("FG500"),
adjust.method = "BH")
## [1] "Binning on MAPK3 expression"
binned_dataset$FG500$gsea$FG500_PBMC$data_type <- c(rep("random", n.random.sign), "real")
binned_dataset$FG500$gsea$FG500_PBMC$data_type <- factor(binned_dataset$FG500$gsea$FG500_PBMC$data_type, levels = c("real", "random"))
rm(i, core, n.random.sign, sign, random_sign)
ggplot(binned_dataset$FG500$gsea$FG500_PBMC, aes(x=NES, y=-log10(pval), size=-log10(pval), fill=data_type))+
geom_point(shape=21) +
theme_bw() + theme(aspect.ratio = 2) +
scale_x_continuous(limits = c(-3, 3)) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
geom_vline(xintercept = 0, linetype = "dashed") +
scale_fill_manual(values = c("#BC3C29", "#01A087")) +
ggtitle("Figure S9b - Random gene signatures - ERK") +
scale_y_continuous(limits = c(0,3))
rm(binned_dataset)
set.seed(91)
n.random.sign <- 1000
core <- huva.db::msigdb_V7.2$GSE40666_WT_VS_STAT1_KO_CD8_TCELL_UP
random_sign <- list()
for (i in 1:n.random.sign) {
sign <- sample(rownames(huva.db$FG500$data$PBMC), 195, replace = F)
random_sign[[paste("random nr. ", i, sep = "")]] <- sign
}
random_sign[["core"]] <- core
binned_dataset <- run_huva_experiment(data = huva.db,
gene = "STAT1",
quantiles = 0.10,
gs_list = random_sign,
summ = F,
datasets_list = c("FG500"),
adjust.method = "BH")
## [1] "Binning on STAT1 expression"
binned_dataset$FG500$gsea$FG500_PBMC$data_type <- c(rep("random", n.random.sign), "real")
binned_dataset$FG500$gsea$FG500_PBMC$data_type <- factor(binned_dataset$FG500$gsea$FG500_PBMC$data_type, levels = c("real", "random"))
rm(i, core, n.random.sign, sign, random_sign)
ggplot(binned_dataset$FG500$gsea$FG500_PBMC, aes(x=NES, y=-log10(pval), size=-log10(pval), fill=data_type))+
geom_point(shape=21) +
theme_bw() + theme(aspect.ratio = 2) +
scale_x_continuous(limits = c(-3, 3)) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
geom_vline(xintercept = 0, linetype = "dashed") +
scale_fill_manual(values = c("#BC3C29", "#01A087")) +
ggtitle("Figure S9c - Random gene signatures - STAT1") +
scale_y_continuous(limits = c(0,3))
rm(binned_dataset)
set.seed(91)
MyD88_quant <- optimal_quant(gene_name = "MYD88", FC.Cut = 0)
## Warning: Zero sample variances detected, have been offset away from zero
MyD88_quant$plot_FC +
ggtitle("Fig. S9e - Quantile selection log2 FC") +
xlab("quantile") +
ylab("log2 fold change") +
scale_fill_manual(values = c("#BC3C29", "#01A087"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
MyD88_quant$plot_adj.P.val +
ggtitle("Fig. S9f - Quantile selection -log10 p value") +
xlab("quantile") +
ylab("-log10 p value") +
scale_fill_manual(values = c("#BC3C29", "#01A087"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
MyD88_quant$plot_result +
scale_x_continuous(limits = c(0.085, 0.4)) +
scale_y_continuous(limits = c(0.3, 0.9), breaks = c(0.3, 0.45, 0.6, 0.75, 0.9)) +
geom_smooth(se = FALSE) +
ylab("-log10 p value") +
xlab("log2 fold change") +
ggtitle("Fig. S9g - Quantile selection")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(MyD88_quant$result, aes(x=logFC, y=n.DE)) +
geom_point(shape = 21, size = 3) + theme_bw() +
geom_smooth(se = FALSE) + theme(aspect.ratio = 1/5) +
scale_x_continuous(limits = c(0.085, 0.4)) +
scale_y_continuous(limits = c(0, 8000)) +
ggtitle("Fig. S9g - Quantile selection - Top") +
xlab("log2 fold change")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(MyD88_quant$result, aes(x=-log10(adj.P.val), y=n.DE)) +
geom_point(shape = 21, size = 3) + theme_bw() +
geom_smooth(se = FALSE) + theme(aspect.ratio = 1/5) +
scale_x_continuous(limits = c(0.3, 0.9), breaks = c(0.3, 0.45, 0.6, 0.75, 0.9)) +
scale_y_continuous(limits = c(0, 8000)) +
ggtitle("Fig. S9g - Quantile selection - Side") +
xlab("-log10 p value")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
### Similarities in the gene ranks
source("./source/Rank_overlap_quant.R")
summary(quantile_overlap_MyD88$correlation)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8274 0.9361 0.9654 0.9584 0.9850 1.0000
ggplot(quantile_overlap_MyD88, aes(x=experiment, y=correlation)) +
geom_boxplot(outlier.colour = "black", fill = "#C2C2C2") +
geom_point(position = position_dodge2(0.4), shape=21, alpha=0.0005) +
theme_bw() + theme(aspect.ratio = 4) +
ggtitle("Fig. S9h - Pearson's correlation") +
scale_y_continuous(limits = c(0,1)) +
xlab("") +
ylab("rank-rank Pearson's correlation (r)")
tmp <- quantile_overlap_MyD88_matrix
for (i in 2:dim(tmp)[2]) {
tmp[1:(i-1),i] <- NA
}
pheatmap(tmp,
scale = "none",
cluster_rows = F,
cluster_cols = F,
color = inferno(direction = -1, 50),
show_rownames = F, show_colnames = F,
cellwidth = 4, cellheight = 4,
breaks = seq(0.7,1,length.out = 50),
border_color = NA, na_col = NA,
main = "Fig. S9h -Pearson's correlation")
rm(tmp, i)
info <- sessionInfo()
info
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggsci_2.9 foreach_1.5.1 viridis_0.5.1
## [4] viridisLite_0.3.0 pheatmap_1.0.12 ggpubr_0.4.0
## [7] Rmisc_1.5 plyr_1.8.6 lattice_0.20-41
## [10] fgsea_1.12.0 Rcpp_1.0.5 limma_3.46.0
## [13] reshape2_1.4.4 forcats_0.5.0 stringr_1.4.0
## [16] dplyr_1.0.2 purrr_0.3.4 readr_1.4.0
## [19] tidyr_1.1.2 tibble_3.0.4 tidyverse_1.3.0
## [22] ggplot2_3.3.2 huva.db_0.1.4-2 huva_0.1.4
## [25] BiocManager_1.30.10
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ggsignif_0.6.0
## [3] ellipsis_0.3.1 rio_0.5.16
## [5] XVector_0.30.0 fs_1.5.0
## [7] GenomicRanges_1.42.0 rstudioapi_0.13
## [9] farver_2.0.3 bit64_4.0.5
## [11] fansi_0.4.1 AnnotationDbi_1.52.0
## [13] lubridate_1.7.9.2 xml2_1.3.2
## [15] splines_4.0.3 codetools_0.2-18
## [17] knitr_1.30 jsonlite_1.7.2
## [19] broom_0.7.3 annotate_1.68.0
## [21] dbplyr_2.0.0 graph_1.68.0
## [23] compiler_4.0.3 httr_1.4.2
## [25] backports_1.2.1 assertthat_0.2.1
## [27] Matrix_1.3-0 lazyeval_0.2.2
## [29] cli_2.2.0 htmltools_0.5.0
## [31] tools_4.0.3 gtable_0.3.0
## [33] glue_1.4.2 GenomeInfoDbData_1.2.4
## [35] fastmatch_1.1-0 carData_3.0-4
## [37] Biobase_2.50.0 cellranger_1.1.0
## [39] vctrs_0.3.6 nlme_3.1-151
## [41] iterators_1.0.13 xfun_0.19
## [43] rvest_0.3.6 openxlsx_4.2.3
## [45] lifecycle_0.2.0 rstatix_0.6.0
## [47] XML_3.99-0.5 zlibbioc_1.36.0
## [49] scales_1.1.1 hms_0.5.3
## [51] MatrixGenerics_1.2.0 parallel_4.0.3
## [53] SummarizedExperiment_1.20.0 RColorBrewer_1.1-2
## [55] yaml_2.2.1 curl_4.3
## [57] memoise_1.1.0 gridExtra_2.3
## [59] stringi_1.5.3 RSQLite_2.2.1
## [61] GSVA_1.38.0 S4Vectors_0.28.1
## [63] BiocGenerics_0.36.0 zip_2.1.1
## [65] BiocParallel_1.24.1 GenomeInfoDb_1.26.2
## [67] rlang_0.4.9 pkgconfig_2.0.3
## [69] matrixStats_0.57.0 bitops_1.0-6
## [71] evaluate_0.14 labeling_0.4.2
## [73] htmlwidgets_1.5.3 bit_4.0.4
## [75] tidyselect_1.1.0 GSEABase_1.52.1
## [77] magrittr_2.0.1 R6_2.5.0
## [79] IRanges_2.24.1 generics_0.1.0
## [81] DelayedArray_0.16.0 DBI_1.1.0
## [83] mgcv_1.8-33 pillar_1.4.7
## [85] haven_2.3.1 foreign_0.8-81
## [87] withr_2.3.0 abind_1.4-5
## [89] RCurl_1.98-1.2 modelr_0.1.8
## [91] crayon_1.3.4 car_3.0-10
## [93] plotly_4.9.2.2 rmarkdown_2.6
## [95] grid_4.0.3 readxl_1.3.1
## [97] data.table_1.13.4 blob_1.2.1
## [99] reprex_0.3.0 digest_0.6.27
## [101] xtable_1.8-4 stats4_4.0.3
## [103] munsell_0.5.0
save.image(paste("./data/",Sys.Date(), "_huva_Figure_3.RData", sep = ""))